package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * {@link AnalyseNews} implementation for pages hosted under china.com.cn.
 *
 * <p>It decides whether a URL looks like an article (detail) page and extracts
 * title, publication date, body and author from the page HTML into a
 * {@link NewsMeta}.
 */
public class ChinaComCnNewsAnalyse implements AnalyseNews {

	/**
	 * URL shapes accepted as detail pages. Compiled once because
	 * {@link #isDetailPage(String)} may be called for every crawled link.
	 *
	 * <p>Every literal dot is written as {@code [.]} so that it only matches a
	 * real dot; a bare {@code .} would also accept look-alike hosts such as
	 * {@code henanXchinaYcomZcn}.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = compileAll(
			"http://finance[.]china[.]com[.]cn/.*[0-9]{8}/[0-9]*[.]shtml",
			"http://.*[.]china[.]com[.]cn/.*[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*[.]htm",
			"http://.*[.]china[.]com[.]cn/wwwroot/[a-z]{1}_[0-9]*/[a-z]{1}_[0-9]*[.]html",
			"http://.*[.]china[.]com[.]cn/.*/view/[0-9]+[\\-0-9]*[.]htm",
			"http://.*[.]china[.]com[.]cn/.*/photo/[0-9]+/[0-9]+[.]shtml",
			"http://.*[.]china[.]com[.]cn/.*/photo/[0-9]+/[0-9]+[.]shtml\\?pic=[0-9]*",
			"http://.*[.]china[.]com[.]cn/.*/newsview[0-9]+[\\-0-9]*[.]htm",
			"http://.*[.]china[.]com[.]cn/news/.*/[0-9]*-[0-9]*-[0-9]*/[0-9]+[\\-0-9]*[.]html",
			"http://.*[.]china[.]com[.]cn/info/[0-9]*",
			"http://.*[.]china[.]com[.]cn/html/.*/sp/[0-9]+_1[.]html",
			"http://.*[.]china[.]com[.]cn/.*[0-9]{6}/[0-9]*-[a-z]{0,1}[0-9]*[.]html",
			"http://henan[.]china[.]com[.]cn/html/[a-z]+/[0-9]{4}/[0-9]{4}/[0-9]+[.]html",
			"http://henan[.]china[.]com[.]cn/[a-z]+/[0-9]*/[0-9]*/[0-9]*[.]shtml");

	/** Compiles each regular expression, preserving the given order. */
	private static Pattern[] compileAll(String... regexes) {
		Pattern[] compiled = new Pattern[regexes.length];
		for (int i = 0; i < regexes.length; i++) {
			compiled[i] = Pattern.compile(regexes[i]);
		}
		return compiled;
	}

	/**
	 * Tells whether {@code url} has the shape of a china.com.cn article page.
	 *
	 * <p>Sample URLs collected while writing the patterns:
	 * <pre>
	 * http://home.china.com.cn/chinamodule/module/2012-11-23/658929.shtml
	 * http://finance.china.com.cn/consume/syal/20121123/1150392.shtml
	 * http://www.china.com.cn/renkou/2012-11/06/content_27018983.htm
	 * http://edu.china.com.cn/2012-11/23/content_27202515.htm
	 * http://invest.china.com.cn/wwwroot/c_000000030001/d_80816.html
	 * http://house.china.com.cn/home/view/596796.htm
	 * http://house.china.com.cn/newhouse/newsview596910.htm
	 * http://y.china.com.cn/info/187827
	 * http://f.home.china.com.cn/201211/46-a14901.html
	 * http://www.china.com.cn/info/zhuanti/09kaoyan/2010-03/09/content_19565193.htm
	 * </pre>
	 *
	 * @param url absolute URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches at least one known pattern
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			// matches() requires the entire URL to match, same as String.matches.
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts the article fields from the HTML carried by {@code urlMeta}.
	 *
	 * <p>The result always has type {@code 0} and click/comment counts of
	 * {@code 0}; this analyser does not read those counters from the page.
	 *
	 * @param urlMeta crawl result supplying the page URL and its HTML
	 * @return a new {@link NewsMeta} populated with URL, title, content,
	 *         author and date
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		// Counters are not extracted for this site; they stay at zero.
		int commentNum = 0;
		int clickNum = 0;

		NewsMeta meta = new NewsMeta();
		meta.setUrl(url);
		meta.setType(0);

		Document doc = Jsoup.parse(html);
		// Drop hidden elements and related/blog/pager blocks before extracting
		// anything, so their text cannot leak into the fields below.
		doc.select("[style=display:none]").remove();
		doc.select("div.related,div.blog,div.Content_Two,div.Content_Three,div.BX_page").remove();

		// Title: candidate selectors for the different page templates.
		String title = JSoupUtils.select(doc, "#newsTitle", ".artTitle", "div.tittle", ".left_content h1",
				"div.title", "div.wb_title", "div.content_t>div>strong", "td.fb24", "td.word-24-bt",
				"div.boxMainLeft>h2", "#main_left_title", "td.a8", "td.fb21", ".box18  dt", "h1", "td.word-20");

		// Date: first look in the text of all <span> elements, then fall back
		// to a search keyed on the labels below.
		Long date = DateUtils.matchDate(doc.select("span").text());
		if (date == null) {
			date = JSoupUtils.matchDate(doc, "时间：", "发布时间", "中国网");
		}
		// NOTE(review): date may still be null here if the fallback also finds
		// nothing - confirm NewsMeta.setDate accepts null.

		// Content: photo pages skip the "div.show-content p" extraction and go
		// straight to the generic container selectors.
		String content = "";
		if (url.indexOf("/photo/") == -1) {
			content = HtmlCleaner.getContentHtml(url, doc.select("div.show-content p"));
		}
		if (StringUtils.isBlank(content)) {
			content = JSoupUtils.selectContent(url, doc, "#textnews", "#fontzoom", "#content", "#p_content",
					"#articleContent", "#artibody", "#artbody", "div.content", "div.wb_nr", "#box3",
					"td.word-lm-14", "div.boxMainLeft>p", "td.word-14");
		}

		// Author: looked up via the source label.
		String author = JSoupUtils.matchAuthor(doc, "来源：");

		meta.setTitle(title);
		meta.setContent(content);
		meta.setAuthor(author);
		meta.setDate(date);
		meta.setClickNum(clickNum);
		meta.setCommentNum(commentNum);

		return meta;
	}

	/**
	 * Manual smoke test: fetches one sample URL and prints the parsed result,
	 * or a message when the URL does not match any detail-page pattern.
	 */
	public static void main(String[] args) {
		String url = "http://henan.china.com.cn/news/2016/1108/3673213.shtml";

		ChinaComCnNewsAnalyse analyser = new ChinaComCnNewsAnalyse();
		if (analyser.isDetailPage(url)) {
			UrlMeta urlMeta = CrawlHTML.responseToURL(url);
			NewsMeta parsed = analyser.parserHtml(urlMeta);
			System.out.println(parsed);
		} else {
			System.out.println("不符合正则");
		}
	}

	/**
	 * Not supported by this analyser.
	 *
	 * @param meta previously parsed article (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}; this analyser never requests an update
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
